Plan: - Predict released movie revenue - Predictors: - adult - popularity - budget - release date -> season - runtime - genres - casts (and their popularity?) - non-casts (and their popularity?)
!install.packages(c("httr", "dplyr", "jsonlite", "progress", "dplyr", "tidyr", "ggplot2", "knitr", "ggcorrplot", "stringr", "xgboost", "lubridate", "randomForest", "forcats", "plotly"))
library(httr)
library(dplyr)
library(jsonlite)
library(progress)
library(dplyr)
library(tidyr)
library(ggplot2)
library(knitr)
library(ggcorrplot)
library(stringr)
library(xgboost)
library(lubridate)
library(randomForest)
library(forcats)
library(xgboost)
library(plotly)
set.seed(42)
API_KEY <- "e6d34aa453d1a71af2b6e77848923e57"
BASE_URL <- "https://api.themoviedb.org/3/"
DISCOVER_MOVIE_PATH <- "discover/movie"
MOVIE_PATH <- "movie/"
MOVIES_COUNT <- 1000
MOVIES_PER_PAGE <- 20
response <- GET(
url = paste0(BASE_URL, DISCOVER_MOVIE_PATH),
query = list(
sort_by = "revenue.desc",
release_date.lte = "2025-04-25",
api_key = API_KEY
)
)
movie_total_results <- content(response)$total_results
movie_total_pages <- content(response)$total_pages
cat("There are", movie_total_results, "movies in total provided by the API.\n")
## There are 900253 movies in total provided by the API.
cat("These movies are paginated into", movie_total_pages, "pages", "with", MOVIES_PER_PAGE, "movies per page.\n")
## These movies are paginated into 45013 pages with 20 movies per page.
# response <- GET(
# url = paste0(BASE_URL, DISCOVER_MOVIE_PATH),
# query = list(
# sort_by = "revenue.desc",
# release_date.lte = "2025-04-25",
# api_key = API_KEY
# )
# )
#
# movie_total_results <- content(response)$total_results
# movie_total_pages <- content(response)$total_pages
#
# cat("There are", movie_total_results, "movies in total provided by the API.\n")
# cat("These movies are paginated into", movie_total_pages, "pages", "with", MOVIES_PER_PAGE, "movies per page.\n")
movies <- data.frame()
n_pages <- ceiling(MOVIES_COUNT / MOVIES_PER_PAGE)
pb <- txtProgressBar(min = 0, max = n_pages, style = 3)
## | | | 0%
for (page in 1:n_pages) {
response <- GET(
url = paste0(BASE_URL, DISCOVER_MOVIE_PATH),
query = list(
sort_by = "revenue.desc",
page = page,
api_key = API_KEY
)
)
# Check if API request was successful
if (status_code(response) != 200) {
print(paste("Skipping page", page, "- API error:", status_code(response)))
next
}
data <- fromJSON(content(response, as = "text"))$results
page_movies <- data.frame(
movie_id = data$id,
title = data$title
)
movies <- rbind(movies, page_movies)
setTxtProgressBar(pb, page)
}
## | |= | 2% | |=== | 4% | |==== | 6% | |====== | 8% | |======= | 10% | |======== | 12% | |========== | 14% | |=========== | 16% | |============= | 18% | |============== | 20% | |=============== | 22% | |================= | 24% | |================== | 26% | |==================== | 28% | |===================== | 30% | |====================== | 32% | |======================== | 34% | |========================= | 36% | |=========================== | 38% | |============================ | 40% | |============================= | 42% | |=============================== | 44% | |================================ | 46% | |================================== | 48% | |=================================== | 50% | |==================================== | 52% | |====================================== | 54% | |======================================= | 56% | |========================================= | 58% | |========================================== | 60% | |=========================================== | 62% | |============================================= | 64% | |============================================== | 66% | |================================================ | 68% | |================================================= | 70% | |================================================== | 72% | |==================================================== | 74% | |===================================================== | 76% | |======================================================= | 78% | |======================================================== | 80% | |========================================================= | 82% | |=========================================================== | 84% | |============================================================ | 86% | |============================================================== | 88% | |=============================================================== | 90% | |================================================================ | 92% | |================================================================== | 94% | |=================================================================== | 96% | |===================================================================== | 98% | |======================================================================| 100%
close(pb)
cat("Successfully fetch all movie ids and titles!")
## Successfully fetch all movie ids and titles!
pb <- txtProgressBar(min = 0, max = nrow(movies), style = 3)
## | | | 0%
all_genres <- c()
for (i in 1:nrow(movies)) {
movie_id <- movies$movie_id[i]
response <- GET(
url = paste0(BASE_URL, MOVIE_PATH, movie_id),
query = list(
api_key = API_KEY
)
)
# Check if API request was successful
if (status_code(response) != 200) {
print(paste("Skipping movie", movie_id, "- API error:", status_code(response)))
next
}
data <- fromJSON(content(response, as = "text"))
movies$revenue[i] <- data$revenue
movies$adult[i] <- as.logical(data$adult)
movies$popularity[i] <- data$popularity
movies$budget[i] <- data$budget
movies$release_date[i] <- data$release_date
movies$runtime[i] <- data$runtime
genres <- data$genres
movies$genres[i] <- paste(genres$name, collapse = ",")
for (genre in genres$name) {
all_genres <- unique(c(all_genres, genre))
}
setTxtProgressBar(pb, i)
}
## | | | 1% | |= | 1% | |= | 2% | |== | 2% | |== | 3% | |== | 4% | |=== | 4% | |=== | 5% | |==== | 5% | |==== | 6% | |===== | 6% | |===== | 7% | |===== | 8% | |====== | 8% | |====== | 9% | |======= | 9% | |======= | 10% | |======= | 11% | |======== | 11% | |======== | 12% | |========= | 12% | |========= | 13% | |========= | 14% | |========== | 14% | |========== | 15% | |=========== | 15% | |=========== | 16% | |============ | 16% | |============ | 17% | |============ | 18% | |============= | 18% | |============= | 19% | |============== | 19% | |============== | 20% | |============== | 21% | |=============== | 21% | |=============== | 22% | |================ | 22% | |================ | 23% | |================ | 24% | |================= | 24% | |================= | 25% | |================== | 25% | |================== | 26% | |=================== | 26% | |=================== | 27% | |=================== | 28% | |==================== | 28% | |==================== | 29% | |===================== | 29% | |===================== | 30% | |===================== | 31% | |====================== | 31% | |====================== | 32% | |======================= | 32% | |======================= | 33% | |======================= | 34% | |======================== | 34% | |======================== | 35% | |========================= | 35% | |========================= | 36% | |========================== | 36% | |========================== | 37% | |========================== | 38% | |=========================== | 38% | |=========================== | 39% | |============================ | 39% | |============================ | 40% | |============================ | 41% | |============================= | 41% | |============================= | 42% | |============================== | 42% | |============================== | 43% | |============================== | 44% | |=============================== | 44% | |=============================== | 45% | |================================ | 45% | |================================ | 46% | |================================= | 46% | |================================= | 47% | |================================= | 48% | |================================== | 48% | |================================== | 49% | |=================================== | 49% | |=================================== | 50% | |=================================== | 51% | |==================================== | 51% | |==================================== | 52% | |===================================== | 52% | |===================================== | 53% | |===================================== | 54% | |====================================== | 54% | |====================================== | 55% | |======================================= | 55% | |======================================= | 56% | |======================================== | 56% | |======================================== | 57% | |======================================== | 58% | |========================================= | 58% | |========================================= | 59% | |========================================== | 59% | |========================================== | 60% | |========================================== | 61% | |=========================================== | 61% | |=========================================== | 62% | |============================================ | 62% | |============================================ | 63% | |============================================ | 64% | |============================================= | 64% | |============================================= | 65% | |============================================== | 65% | |============================================== | 66% | |=============================================== | 66% | |=============================================== | 67% | |=============================================== | 68% | |================================================ | 68% | |================================================ | 69% | |================================================= | 69% | |================================================= | 70% | |================================================= | 71% | |================================================== | 71% | |================================================== | 72% | |=================================================== | 72% | |=================================================== | 73% | |=================================================== | 74% | |==================================================== | 74% | |==================================================== | 75% | |===================================================== | 75% | |===================================================== | 76% | |====================================================== | 76% | |====================================================== | 77% | |====================================================== | 78% | |======================================================= | 78% | |======================================================= | 79% | |======================================================== | 79% | |======================================================== | 80% | |======================================================== | 81% | |========================================================= | 81% | |========================================================= | 82% | |========================================================== | 82% | |========================================================== | 83% | |========================================================== | 84% | |=========================================================== | 84% | |=========================================================== | 85% | |============================================================ | 85% | |============================================================ | 86% | |============================================================= | 86% | |============================================================= | 87% | |============================================================= | 88% | |============================================================== | 88% | |============================================================== | 89% | |=============================================================== | 89% | |=============================================================== | 90% | |=============================================================== | 91% | |================================================================ | 91% | |================================================================ | 92% | |================================================================= | 92% | |================================================================= | 93% | |================================================================= | 94% | |================================================================== | 94% | |================================================================== | 95% | |=================================================================== | 95% | |=================================================================== | 96% | |==================================================================== | 96% | |==================================================================== | 97% | |==================================================================== | 98% | |===================================================================== | 98% | |===================================================================== | 99% | |======================================================================| 99% | |======================================================================| 100%
close(pb)
cat("Successfully fetch movie details!")
## Successfully fetch movie details!
all_genres
## [1] "Action" "Adventure" "Fantasy" "Science Fiction"
## [5] "Drama" "Romance" "Animation" "Comedy"
## [9] "Family" "Thriller" "Crime" "History"
## [13] "Music" "War" "Mystery" "Horror"
## [17] "Western" "Documentary"
cat(paste0(BASE_URL, MOVIE_PATH, 1231, "credits"))
## https://api.themoviedb.org/3/movie/1231credits
pb <- txtProgressBar(min = 0, max = nrow(movies), style = 3)
## | | | 0%
movies$cast <- NA # Stores top 5 actors
movies$non_cast <- NA # Stores top 5 crew members (excluding actors)
for (i in 1:nrow(movies)) {
movie_id <- movies$movie_id[i]
response <- GET(
url = paste0(BASE_URL, MOVIE_PATH, movie_id, "/credits"),
query = list(
api_key = API_KEY
)
)
# Check if API request was successful
if (status_code(response) != 200) {
print(paste("Skipping movie", movie_id, "- API error:", status_code(response)))
next
}
credits <- fromJSON(content(response, as = "text"))
# Extract CAST (Actors)
if (!is.null(credits$cast) && length(credits$cast) > 0) {
# Filter only actors (those with a "character" field)
cast_members <- credits$cast[!is.null(credits$cast$character), ]
top_cast <- head(cast_members, 5)
# Format: "Actor1 (popularity), Actor2 (popularity)"
movies$cast[i] <- paste0(top_cast$name, "(", round(top_cast$popularity, 1), ")", collapse = ", ")
movies$actor_avg_pop[i] <- mean(top_cast$popularity)
} else {
movies$cast[i] <- NA # Handle missing actor
movies$actor_avg_pop[i] <- NA
}
# Extract NON-CAST (Directors, Producers, etc.)
if (!is.null(credits$crew) && length(credits$crew) > 0) {
# Exclude actors (crew members without "character" role)
non_cast_members <- credits$crew[is.null(credits$crew$character), ]
top_non_cast <- head(non_cast_members, 5)
# Format: "Crew1 (popularity), Crew2 (popularity)"
movies$non_cast[i] <- paste0(top_non_cast$name, "(", round(top_non_cast$popularity, 1), ")", collapse = ", ")
movies$crew_avg_pop[i] <- mean(top_non_cast$popularity)
} else {
movies$non_cast[i] <- NA # Handle missing crew
movies$crew_avg_pop[i] <- NA
}
setTxtProgressBar(pb, i)
}
## | | | 1% | |= | 1% | |= | 2% | |== | 2% | |== | 3% | |== | 4% | |=== | 4% | |=== | 5% | |==== | 5% | |==== | 6% | |===== | 6% | |===== | 7% | |===== | 8% | |====== | 8% | |====== | 9% | |======= | 9% | |======= | 10% | |======= | 11% | |======== | 11% | |======== | 12% | |========= | 12% | |========= | 13% | |========= | 14% | |========== | 14% | |========== | 15% | |=========== | 15% | |=========== | 16% | |============ | 16% | |============ | 17% | |============ | 18% | |============= | 18% | |============= | 19% | |============== | 19% | |============== | 20% | |============== | 21% | |=============== | 21% | |=============== | 22% | |================ | 22% | |================ | 23% | |================ | 24% | |================= | 24% | |================= | 25% | |================== | 25% | |================== | 26% | |=================== | 26% | |=================== | 27% | |=================== | 28% | |==================== | 28% | |==================== | 29% | |===================== | 29% | |===================== | 30% | |===================== | 31% | |====================== | 31% | |====================== | 32% | |======================= | 32% | |======================= | 33% | |======================= | 34% | |======================== | 34% | |======================== | 35% | |========================= | 35% | |========================= | 36% | |========================== | 36% | |========================== | 37% | |========================== | 38% | |=========================== | 38% | |=========================== | 39% | |============================ | 39% | |============================ | 40% | |============================ | 41% | |============================= | 41% | |============================= | 42% | |============================== | 42% | |============================== | 43% | |============================== | 44% | |=============================== | 44% | |=============================== | 45% | |================================ | 45% | |================================ | 46% | |================================= | 46% | |================================= | 47% | |================================= | 48% | |================================== | 48% | |================================== | 49% | |=================================== | 49% | |=================================== | 50% | |=================================== | 51% | |==================================== | 51% | |==================================== | 52% | |===================================== | 52% | |===================================== | 53% | |===================================== | 54% | |====================================== | 54% | |====================================== | 55% | |======================================= | 55% | |======================================= | 56% | |======================================== | 56% | |======================================== | 57% | |======================================== | 58% | |========================================= | 58% | |========================================= | 59% | |========================================== | 59% | |========================================== | 60% | |========================================== | 61% | |=========================================== | 61% | |=========================================== | 62% | |============================================ | 62% | |============================================ | 63% | |============================================ | 64% | |============================================= | 64% | |============================================= | 65% | |============================================== | 65% | |============================================== | 66% | |=============================================== | 66% | |=============================================== | 67% | |=============================================== | 68% | |================================================ | 68% | |================================================ | 69% | |================================================= | 69% | |================================================= | 70% | |================================================= | 71% | |================================================== | 71% | |================================================== | 72% | |=================================================== | 72% | |=================================================== | 73% | |=================================================== | 74% | |==================================================== | 74% | |==================================================== | 75% | |===================================================== | 75% | |===================================================== | 76% | |====================================================== | 76% | |====================================================== | 77% | |====================================================== | 78% | |======================================================= | 78% | |======================================================= | 79% | |======================================================== | 79% | |======================================================== | 80% | |======================================================== | 81% | |========================================================= | 81% | |========================================================= | 82% | |========================================================== | 82% | |========================================================== | 83% | |========================================================== | 84% | |=========================================================== | 84% | |=========================================================== | 85% | |============================================================ | 85% | |============================================================ | 86% | |============================================================= | 86% | |============================================================= | 87% | |============================================================= | 88% | |============================================================== | 88% | |============================================================== | 89% | |=============================================================== | 89% | |=============================================================== | 90% | |=============================================================== | 91% | |================================================================ | 91% | |================================================================ | 92% | |================================================================= | 92% | |================================================================= | 93% | |================================================================= | 94% | |================================================================== | 94% | |================================================================== | 95% | |=================================================================== | 95% | |=================================================================== | 96% | |==================================================================== | 96% | |==================================================================== | 97% | |==================================================================== | 98% | |===================================================================== | 98% | |===================================================================== | 99% | |======================================================================| 99% | |======================================================================| 100%
close(pb)
colnames(movies) <- c(
"movie_id", # Unique movie ID
"title", # Movie title
"revenue", # Box office revenue
"adult", # Whether the movie is rated adult (TRUE/FALSE)
"popularity", # Popularity score from TMDB
"budget", # Production budget
"release_date", # Release date
"runtime", # Duration in minutes
"genres", # Movie genres
"actors", # Top 5 actors
"crews", # Top 5 crew members
"actor_avg_pop", # Average popularity of the top 5 actors
"crew_avg_pop" # Average popularity of the top 5 crew members
)
colnames(movies)
## [1] "movie_id" "title" "revenue" "adult"
## [5] "popularity" "budget" "release_date" "runtime"
## [9] "genres" "actors" "crews" "actor_avg_pop"
## [13] "crew_avg_pop"
movies_original <- movies
release_date to datemovies$release_date <- as.Date(movies$release_date)
colSums(is.na(movies))
## movie_id title revenue adult popularity
## 0 0 0 0 0
## budget release_date runtime genres actors
## 0 6 0 0 3
## crews actor_avg_pop crew_avg_pop
## 2 3 2
cat("Summary for movie_id:\n")
## Summary for movie_id:
summary(movies$movie_id)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 11 4560 49033 201106 330111 1462748
cat("Number of empty title: ", sum(movies$title == ""), "\n")
## Number of empty title: 0
cat("Summary for revenue:\n")
## Summary for revenue:
summary(movies$revenue)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.915e+08 2.448e+08 3.364e+08 4.421e+08 5.087e+08 2.924e+09
cat("Unique values of adult:\n")
## Unique values of adult:
unique(movies$adult)
## [1] FALSE
cat("Summary for popularity:\n")
## Summary for popularity:
summary(movies$popularity)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 7.047 11.082 15.143 16.706 485.233
cat("Summary for budget:\n")
## Summary for budget:
summary(movies$budget)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0 40000000 84000000 96251576 145000000 460000000
cat("Summary for release_date:\n")
## Summary for release_date:
summary(movies$release_date)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## "1939-12-15" "2002-12-18" "2011-09-03" "2009-05-06" "2017-07-03" "2025-03-31"
## NA's
## "6"
cat("Summary for runtime:\n")
## Summary for runtime:
summary(movies$runtime)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0 102 116 118 131 233
cat("Unique values of genres:\n")
## Unique values of genres:
unique(unique(unlist(strsplit(movies$genres, ","))))
## [1] "Action" "Adventure" "Fantasy" "Science Fiction"
## [5] "Drama" "Romance" "Animation" "Comedy"
## [9] "Family" "Thriller" "Crime" "History"
## [13] "Music" "War" "Mystery" "Horror"
## [17] "Western" "Documentary"
We observe that all values are normal, except that adult is always false, some movies have 0 budget, some movies have 0 runtime, some movies don’t have any actor or crew, and there are no release date for 4 movies.
Since adult is always false, we’re not going to include it into our analysis since it doesn’t provide any insight.
movies <- select(movies, -adult)
zero_budget <- movies$budget == 0
movies[zero_budget,]
## movie_id
## 56 1452397
## 163 1066298
## 257 1462748
## 286 1299537
## 294 1228891
## 302 801803
## 313 612845
## 323 1357305
## 338 715904
## 386 923192
## 395 334298
## 415 538331
## 449 497984
## 455 372058
## 498 1147596
## 502 15947
## 507 479306
## 530 916224
## 695 783675
## 714 1089654
## 746 605375
## 765 635389
## 778 1022145
## 802 170882
## 805 362682
## 820 575813
## 825 457251
## 835 1078012
## 863 716815
## 869 170657
## 899 803266
## 905 508747
## 928 513692
## 952 168626
## 957 1462164
## 960 240417
## 979 810693
## 995 1300945
## title
## 56 No Need For Help
## 163 Full River Red
## 257 DUDURITMUS
## 286 Successor
## 294 Pegasus 2
## 302 Moon Man
## 313 My People, My Country
## 323 Detective Chinatown 1900
## 338 Metallica: WorldWired Tour - Live in Manchester, England - June 18, 2019
## 386 Too Cool to Kill
## 395 Monster Hunt
## 415 Hello Mr. Billionaire
## 449 Monster Hunt 2
## 455 Your Name.
## 498 Article 20
## 502 The Three Caballeros
## 507 Never Say Die
## 530 Suzume
## 695 The First Slam Dunk
## 714 Chang'an
## 746 The Bravest
## 765 Jiang Ziya
## 778 Home Coming
## 802 The Black Hand
## 805 Goodbye Mr. Loser
## 820 Better Days
## 825 Youth
## 835 Boonie Bears: Guardian Code
## 863 A Little Red Flower
## 869 Journey to the West: Conquering the Demons
## 899 Nice View
## 905 Us and Them
## 928 Raging Fire
## 952 DeAD
## 957 Jigsaw Prank Gone Wrong
## 960 Seeds of Destiny
## 979 Jujutsu Kaisen 0
## 995 A Place Called Silence
## revenue popularity budget release_date runtime
## 56 1000000000 0.0214 0 <NA> 10
## 163 673556758 0.9452 0 2023-01-22 157
## 257 500000000 0.0000 0 <NA> 24
## 286 473859800 1.6694 0 2024-07-13 133
## 294 468924000 1.2722 0 2024-02-10 121
## 302 460300583 1.4097 0 2022-07-29 122
## 313 450064993 2.0312 0 2019-09-30 154
## 323 441460000 4.8650 0 2025-01-29 135
## 338 426900000 0.9987 0 2020-06-08 150
## 386 393014816 0.4171 0 2022-02-01 109
## 395 387053506 2.4567 0 2015-07-16 118
## 415 376000000 1.3348 0 2018-07-27 118
## 449 361682618 2.6082 0 2018-02-16 110
## 455 358000000 21.6188 0 2016-08-26 106
## 498 337554287 2.0562 0 2024-02-10 141
## 502 336000000 2.1768 0 1944-12-21 71
## 507 334530869 0.4672 0 2017-09-30 100
## 530 323638107 16.3569 0 2022-11-11 121
## 695 258000000 8.4948 0 2022-12-03 125
## 714 252520750 1.8860 0 2023-07-02 168
## 746 245179562 2.2259 0 2019-08-29 118
## 765 240656068 2.5244 0 2020-10-01 105
## 778 236085820 1.1072 0 2022-09-30 137
## 802 230000000 0.7611 0 1973-03-16 90
## 805 228549747 1.3488 0 2015-09-30 104
## 820 225879065 5.1351 0 2019-10-25 135
## 825 224160000 1.5422 0 2017-12-15 136
## 835 221982308 3.4369 0 2023-01-22 94
## 863 216000000 0.9708 0 2020-12-31 128
## 869 215637183 2.9811 0 2013-02-07 110
## 899 211018982 1.6484 0 2022-02-01 106
## 905 209221331 4.0993 0 2018-04-28 120
## 928 205000000 3.3429 0 2021-07-28 125
## 952 201103001 0.3323 0 2013-02-10 104
## 957 200000000 0.0000 0 2024-12-30 5
## 960 200000000 0.5544 0 1946-01-01 20
## 979 195870885 19.7894 0 2021-12-24 105
## 995 192856689 2.6183 0 2024-07-03 119
## genres
## 56
## 163 Mystery,Comedy,Thriller,History
## 257
## 286 Comedy
## 294 Drama,Comedy,Adventure
## 302 Science Fiction,Comedy,Drama
## 313 Drama,History
## 323 Comedy,Mystery,Action
## 338 Music
## 386 Action,Comedy
## 395 Comedy,Fantasy,Adventure
## 415 Comedy
## 449 Adventure,Comedy,Fantasy
## 455 Animation,Romance,Drama
## 498 Drama,Comedy
## 502 Animation,Family,Music
## 507 Comedy,Fantasy
## 530 Animation,Drama,Adventure,Fantasy
## 695 Animation,Comedy,Drama
## 714 Animation,History
## 746 Drama,Action
## 765 Animation,Action,Fantasy,Adventure
## 778 Drama,War
## 802 Crime
## 805 Romance,Comedy
## 820 Drama,Crime,Romance
## 825 Drama,History
## 835 Animation,Comedy,Science Fiction,Family,Adventure
## 863 Drama,Family
## 869 Action,Fantasy,Adventure,Comedy
## 899 Drama,Family,Comedy
## 905 Drama,Romance
## 928 Action,Crime,Thriller,Adventure,Mystery
## 952 Comedy,Drama,Thriller
## 957 Comedy
## 960 Documentary,War
## 979 Animation,Action,Fantasy
## 995 Crime,Mystery,Thriller
## actors
## 56 Sam N(0)
## 163 Teng Shen(1), Jackson Yee(2.3), Zhang Yi(0.7), Lei Jiayin(0.7), Yue Yunpeng(0.4)
## 257 <NA>
## 286 Teng Shen(1), Ma Li(0.6), Shi Pengyuan(0.3), Sa Rina(0.5), Xiao Bochen(0.1)
## 294 Teng Shen(1), Yin Zheng(0.7), Zhang Benyu(0.2), Fan Chengcheng(0.7), Sun Yizhou(0.5)
## 302 Teng Shen(1), Ma Li(0.6), Chang Yuan(0.3), Li Chengru(0.6), Cailun Huang(0.2)
## 313 Ge You(0.5), Huang Bo(1), Zhang Yi(0.7), Simon Yam(1), Song Jia(1.6)
## 323 Wang Baoqiang(0.7), Liu Haoran(0.8), Chow Yun-Fat(1.9), White-K(0.6), Zhang Xincheng(1.8)
## 338 James Hetfield(0.5), Lars Ulrich(0.2), Kirk Hammett(0.3), Robert Trujillo(0.2)
## 386 Ma Li(0.6), Wei Xiang(0.5), Allen Ai Lun(0.5), Chen Minghao(0.4), Cailun Huang(0.2)
## 395 Bai Baihe(0.6), Jing Boran(0.8), Jiang Wu(0.6), Elaine Jin Yan-Ling(0.4), Wallace Chung(0.7)
## 415 Teng Shen(1), Vivian Sung(0.5), Zhang Yiming(0.1), Morning Chang(0.4), Chang Yuan(0.3)
## 449 Tony Leung(1.5), Bai Baihe(0.6), Jing Boran(0.8), Chris Lee Yuchun(0.5), Tony Yang(0.9)
## 455 Ryunosuke Kamiki(1.1), Mone Kamishiraishi(1.3), Ryo Narita(0.8), Aoi Yuki(1.6), Nobunaga Shimazaki(1.5)
## 498 Lei Jiayin(0.7), Ma Li(0.6), Gao Ye(0.7), Zhao Liying(5.9), Liu Yaowen(0.2)
## 502 Clarence Nash(0.5), Sterling Holloway(0.6), Joaquin Garay(0.1), José Oliveira(0.1), Aurora Miranda(0.2)
## 507 Allen Ai Lun(0.5), Ma Li(0.6), Teng Shen(1), Xue Haowen(0.4), Chang Yuan(0.3)
## 530 Nanoka Hara(0.6), Hokuto Matsumura(0.7), Eri Fukatsu(1), Shota Sometani(0.7), Sairi Ito(0.9)
## 695 Shugo Nakamura(0.4), Jun Kasama(0.4), Kenta Miyake(1.3), Shinichiro Kamio(0.6), Subaru Kimura(1)
## 714 Yang Tianxiang(0.4), Ling Zhenhe(0.2), Junquan Wu(0.1), Xuan Xiaoming(0.2), Lu Lifeng(0.1)
## 746 Huang Xiaoming(1.2), Du Jiang(0.4), Tan Zhuo(0.5), Yang Zi(3.5), Ou Hao(3.7)
## 765 Zheng Xi(0.1), Jiang Guangtao(0.5), Ji Guanlin(0.2), Sheng Feng(0.5), Yang Ning(0.3)
## 778 Zhang Yi(0.7), Karry Wang(0.8), Yin Tao(0.9), Cheng Taishen(0.4), Zhang Zixian(0.5)
## 802 Lionel Stander(0.5), Rosanna Fratello(0.1), Michele Placido(0.7), Corrado Gaipa(0.2), Annie Carol Edel(0.2)
## 805 Teng Shen(1), Ma Li(0.6), Allen Ai Lun(0.5), Yin Zheng(0.7), Chang Yuan(0.3)
## 820 Zhou Dongyu(1), Jackson Yee(2.3), Yin Fang(0.4), Huang Jue(0.6), Wu Yue(0.4)
## 825 Huang Xuan(0.9), Miao Miao(0.4), Elane Zhong(1.3), Yang Caiyu(0.4), Li Xiaofeng(0.3)
## 835 Zhang Bingjun(0.1), Zhang Wei(0.1), Yingying Miu(0.1), Tan Xiao(0.2), 贾晨露(0.1)
## 863 Jackson Yee(2.3), Liu Haocun(1.4), Gao Yalin(0.4), Zhu Yuanyuan(0.3), Yu Xia(0.7)
## 869 Wen Zhang(0.4), Shu Qi(2.9), Huang Bo(1), Show Lo(0.5), Lee Sheung-Ching(0.2)
## 899 Jackson Yee(2.3), Tian Yu(0.4), Zhang Yu(0.6), Eric Wang(0.6), Qi Xi(0.6)
## 905 Jing Boran(0.8), Zhou Dongyu(1), Tian Zhuangzhuang(0.3), Qu Zheming(0.2), Zhang Zixian(0.5)
## 928 Donnie Yen(2.7), Nicholas Tse Ting-Fung(1.8), Qin Lan(1.4), Ray Lui(1.1), Patrick Tam(0.9)
## 952 Tilman Strauss(0.1), Thomas Schendel(0.1), Judith Rosmair(0.1), Niklas Kohrt(0.1), Ruby O. Fee(1.6)
## 957 Samuel Day(0), Henry Kieffer(0)
## 960 Ralph Bellamy(0.7)
## 979 Megumi Ogata(1.7), Kana Hanazawa(2), Yuichi Nakamura(1.4), Takahiro Sakurai(2.3), Mikako Komatsu(1.4)
## 995 Eric Wang(0.6), Janine Chang(0.9), Francis Ng Chun-Yu(1.7), Wang Shengdi(0.7), Cai Ming(0.5)
## crews
## 56 Aidan Wainwright(0)
## 163 Zhang Yimou(0.6), Yiqun Jia(0.1), Li Yongyi(0), Zhao Xiaoding(0.1), Han Hong(0.1)
## 257 DJANITA(0)
## 286 Peng Damo(0.1), Yan Fei(0.1), Yan Fei(0.1), Peng Damo(0.1), Lin Bingbao(0)
## 294 Han Han(0.3), Han Han(0.3), Li Jie(0.1), Lu Jia(0), Chen Zhixi(0.2)
## 302 Yuyue Shen(0.1), Chenguang Qian(0), Si'ao Dai(0), Zhang Chiyu(0), Zhang Li(0)
## 313 Faye Wong(0.6), Wang Yibing(0.1), Binxing Fu(0.1), Xiaobei Cao(0), Xin Liu(0)
## 323 Chen Sicheng(0.7), Chen Sicheng(0.7), Dai Mo(0.1), Nathan Wang(0.4), Chen Sicheng(0.7)
## 338 Lug Zajonc(0), Dan Braun(0)
## 386 Xing Wenxiong(0), Xing Wenxiong(0), Fei Peng(0)
## 395 Raman Hui Shing-Ngai(0.4), Alan Yuen(0.3), Alan Yuen(0.3), Hao Lee(0), Ho Yiu-Leung(0.1)
## 415 Duan Aojuan(0.1), Meng Meiqi(0.2), Xu Mengjie(0.7), Yamy(0.1), Sunnee(0.1)
## 449 Su Liang(0.2), Hao Lee(0), Shadow Hung(0), Ho Yiu-Leung(0.1), Raman Hui Shing-Ngai(0.4)
## 455 Masanori Yumiya(0), Shinichirou Inoue(0.1), Tatsuro Hatanaka(0.1), Junji Zenki(0.1), Ken Sakamoto(0.1)
## 498 Li Meng(0.1), Xiaobei Cao(0), Li Chuan-Long(0), Zhao Xiaoding(0.1), Chen Minzheng(0.1)
## 502 Billy Daniel(0.1), Walt Disney(0.7), Donald Halliday(0.1), Norman Ferguson(0.2), Eric Larson(0.2)
## 507 Zhang Chiyu(0), Zhang Chiyu(0), Derek Hui Wang-Yu(0.2), Si'ao Dai(0), Yuyue Shen(0.1)
## 530 Yojiro Noda(0.5), Yojiro Noda(0.5), Kazuma Jinnouchi(0.2), Toaka(0), Yusuke Takeda(0.1)
## 695 Takehiko Inoue(0.3), Yasuyuki Ebara(0.3), Haruka Kamatani(0), Ryuichi Takita(0.1), Yota Tsuruoka(0.5)
## 714 Xie Junwei(0.1), Zou Jing(0), Gary Wang(0.2), Li Bai(0), Guo Haowei(0.1)
## 746 Peter Kam Pau-Tat(0.2), Shengji Liang(0), Yu Yonggan(0.1), Barfuss Hui(0), Tony Chan(0.2)
## 765 Li Ruijie(0.1), Jing Wang(0.1), Li Ruijie(0.1), Wei Yunyun(0.1), Wang Changtian(0.1)
## 778 Frant Gwo(0.8), Rao Xiaozhi(0.2), Faye Wong(0.6), Shi Ce(0), Lei Zhilong(0.1)
## 802 Antonio Racioppi(0.1), Gino Capone(0.2), Carlo Rustichelli(0.1), Ignazio Dolce(0.2), Alessandro Fallai(0)
## 805 Derek Hui Wang-Yu(0.2), Yan Fei(0.1), Peng Damo(0.1), Zhou Xiaolin(0), Tian Tian(0.1)
## 820 Gao Ke(0), Peter Chan(0.4), Lili Sun(0.1), Shujie Zong(0), Hong Qin(0.1)
## 825 Yu Gong(0.8), Feng Xiaogang(0.6), Yan Geling(0.5), Luo Pan(0), 张琪(0)
## 835 Yongchang Lin(0.1), Heqi Shao(0), Yongchang Lin(0.1), Heqi Shao(0), Tiezhi Cui(0.1)
## 863 Wang Gang(0.1), Rui Zhong(0), Zhao Yingjun(0.2), Yan Han(0.4), Jiawei Jia(0.1)
## 869 Zhang Dajun(0.1), Ivy Kong Yuk-Yee(0.2), Stephen Chow(3.2), Stephen Chow(3.2), Ellen Eliasoph(0)
## 899 Huang Chao(0), Wei Zhong(0.1), Mingyi Liu(0), Wang Yibing(0.1), Wen Muye(0.2)
## 905 Yuan Yuan(0.2), Yu Pan(0), Dong Ping(0.2), Liu Rong(0), Zheng Zhihao(0.1)
## 928 Flora Au Wai-Ling(0.1), Nicolas Errèra(0.1), Tim Tong Yiu-Leung(0.1), Benny Chan Muk-Sing(0.6), Nicholas Tse Ting-Fung(1.8)
## 952 Carol Burandt von Kameke(0), Sven Halfar(0), Sven Halfar(0)
## 957 Henry Kieffer(0), Henry Kieffer(0), Henry Kieffer(0), Henry Kieffer(0)
## 960 David Miller(0.2), Art Arthur(0.1), Gene Fowler Jr.(0.1), David Miller(0.2)
## 979 Gege Akutami(0.2), Katsuhiro Nakano(0.3), Alisa Okehazama(0.1), Tadashi Hiramatsu(0.4), Sunghoo Park(0.4)
## 995 Boon-lip Quah(0.1), Boon-lip Quah(0.1), You Wenwei(0.1), Zhang Ying(0)
## actor_avg_pop crew_avg_pop
## 56 0.00000 0.000000
## 163 1.02544 0.202460
## 257 NA 0.000000
## 286 0.51680 0.091820
## 294 0.63448 0.169860
## 302 0.53308 0.040080
## 313 0.96548 0.167260
## 323 1.16570 0.524720
## 338 0.27175 0.013050
## 386 0.44024 0.036300
## 395 0.63568 0.215140
## 415 0.46784 0.251380
## 449 0.86762 0.142740
## 455 1.26204 0.076240
## 498 1.59452 0.061680
## 502 0.28960 0.268240
## 507 0.56262 0.063580
## 530 0.79050 0.274200
## 695 0.74016 0.220360
## 714 0.20390 0.073980
## 746 1.84240 0.115940
## 765 0.32410 0.097660
## 778 0.67318 0.333360
## 802 0.33376 0.118660
## 805 0.63026 0.099260
## 820 0.92132 0.117140
## 825 0.65676 0.408740
## 835 0.09998 0.081580
## 863 1.00314 0.165580
## 869 1.00910 1.343680
## 899 0.89386 0.080240
## 905 0.54952 0.097180
## 928 1.56860 0.516760
## 952 0.41146 0.025500
## 957 0.00000 0.000000
## 960 0.70960 0.152525
## 979 1.77070 0.270960
## 995 0.86034 0.086000
zero_runtime <- movies$runtime == 0
movies[zero_runtime,]
## movie_id title revenue popularity
## 231 1454250 Salada. 541441574 0.0071
## 258 1432484 The World of the Pharaoh (Live Theatre) 500000000 0.0338
## budget release_date runtime genres actors
## 231 40 2025-03-25 0 Crime Matheus Ribeiro(0), Salada(0), Garfo(0)
## 258 1000000 2025-02-09 0 Martín Cirio(0)
## crews actor_avg_pop crew_avg_pop
## 231 Pedro Augusto(0), Pedro Augusto(0) 0.006433333 0.00355
## 258 Martín Cirio(0) 0.032700000 0.03270
movies <- filter(movies, budget > 0 & runtime > 0)
actors_na <- is.na(movies$actors)
movies[actors_na,]
## movie_id title revenue popularity budget release_date
## 55 1454531 KUCHAR x CLINTON 1000000000 0.0071 1000000 <NA>
## 920 1148594 ere 200202022 0.1475 1209097 <NA>
## runtime genres actors crews actor_avg_pop crew_avg_pop
## 55 41 <NA> Derwood Shmedley(0) NA 0
## 920 120 <NA> <NA> NA NA
crews_na <- is.na(movies$crews)
movies[crews_na,]
## movie_id title revenue popularity budget
## 364 1425681 Lyna Live From Buenos Aires: Gran Rex 400000000 0.0214 1000000
## 920 1148594 ere 200202022 0.1475 1209097
## release_date runtime genres actors crews actor_avg_pop crew_avg_pop
## 364 2023-01-01 90 Family,Music Lyna(0) <NA> 0.0071 NA
## 920 <NA> 120 <NA> <NA> NA NA
release_date_na <- is.na(movies$release_date)
movies[release_date_na,]
## movie_id title revenue popularity budget release_date
## 55 1454531 KUCHAR x CLINTON 1000000000 0.0071 1000000 <NA>
## 920 1148594 ere 200202022 0.1475 1209097 <NA>
## 921 1433493 Deerfeat 200000000 0.0214 400 <NA>
## 922 1034806 The Pint 200000000 0.0264 3 <NA>
## runtime genres
## 55 41
## 920 120
## 921 5 Action,Horror,Mystery
## 922 5
## actors
## 55 <NA>
## 920 <NA>
## 921 Justice Gondek(0), Justice Gondek(0)
## 922 Joseph Glenn(0), Iain Bayfield(0.1), Callum White(0), Henry Swindells(0), Uilleam MacDougall(0)
## crews actor_avg_pop crew_avg_pop
## 55 Derwood Shmedley(0) NA 0.0000
## 920 <NA> NA NA
## 921 Todd Gondek(0) 0.01430 0.0000
## 922 Zoey Thomas(0) 0.02908 0.0143
movies <- filter(movies, !is.na(release_date) & !is.na(actors) & !is.na(crews))
colSums(is.na(movies))
## movie_id title revenue popularity budget
## 0 0 0 0 0
## release_date runtime genres actors crews
## 0 0 0 0 0
## actor_avg_pop crew_avg_pop
## 0 0
cat("Summary for budget:\n")
## Summary for budget:
summary(movies$budget)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 50000 50000000 88000000 100782583 150000000 460000000
cat("Summary for runtime:\n")
## Summary for runtime:
summary(movies$runtime)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 4.0 103.0 116.0 118.9 131.5 233.0
movies <- filter(movies, runtime != 4)
summary(movies$runtime)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 70.0 103.0 116.5 119.0 131.8 233.0
movies[movies$release_year == 2025,]
## [1] movie_id title revenue popularity budget
## [6] release_date runtime genres actors crews
## [11] actor_avg_pop crew_avg_pop
## <0 rows> (or 0-length row.names)
# Extract year from release date
movies$release_year <- year(movies$release_date)
movies <- filter(movies, release_year != 2025)
Since there are only two movies from 2025, we’re going to remove movies from 2025 so that our analysis is not biased
release_date to seasonmovies$season <- as.factor(case_when(
format(movies$release_date, "%m") %in% c("12", "01", "02") ~ "Winter",
format(movies$release_date, "%m") %in% c("03", "04", "05") ~ "Spring",
format(movies$release_date, "%m") %in% c("06", "07", "08") ~ "Summer",
format(movies$release_date, "%m") %in% c("09", "10", "11") ~ "Fall",
TRUE ~ "Unknown"
))
levels(movies$season)
## [1] "Fall" "Spring" "Summer" "Winter"
genres into one-hot encodingmovies <- movies |>
separate_rows(genres, sep = ",") |>
mutate(value = 1) |>
pivot_wider(names_from = genres, values_from = value, values_fill = list(value = 0))
# Take only movies from 1947 since the data for CPI starts from 1947
cpi <- read.csv("CPIAUCSL.csv")
cpi$observation_month <- as.numeric(substring(cpi$observation_date, 6, 7))
cpi$observation_year <- as.numeric(substring(cpi$observation_date, 1, 4))
movies$release_month <- as.numeric(substring(movies$release_date, 6, 7))
movies <- filter(movies, release_year >= 1946) |>
left_join(cpi, by = c("release_year" = "observation_year", "release_month" = "observation_month"))
# Check if there are any missing values after the join
missing_values <- movies[!complete.cases(movies), ]
missing_values
## # A tibble: 0 × 34
## # ℹ 34 variables: movie_id <int>, title <chr>, revenue <dbl>, popularity <dbl>,
## # budget <int>, release_date <date>, runtime <int>, actors <chr>,
## # crews <chr>, actor_avg_pop <dbl>, crew_avg_pop <dbl>, release_year <dbl>,
## # season <fct>, Action <dbl>, Adventure <dbl>, Fantasy <dbl>,
## # Science Fiction <dbl>, Drama <dbl>, Romance <dbl>, Animation <dbl>,
## # Comedy <dbl>, Family <dbl>, Thriller <dbl>, Crime <dbl>, History <dbl>,
## # Music <dbl>, War <dbl>, Mystery <dbl>, Horror <dbl>, Western <dbl>, …
# Compute average revenue per year
yearly_budget <- movies |>
group_by(release_year) |>
summarise(avg_budget = mean(budget, na.rm = TRUE), movie_count = n()) |>
arrange(release_year)
# Plot time series
ggplot(yearly_budget, aes(x = release_year, y = avg_budget)) +
geom_line(color = "#e74c3c", size = 1.5) + # Bold red line
geom_point(color = "#2c3e50", size = 3) + # Dark gray dots
theme_minimal() +
labs(
title = "Average Movie Budget Over Time",
x = "Release Year",
y = "Average Budget ($)"
) +
scale_y_continuous(labels = scales::dollar_format()) + # Format revenue as dollars
theme(
plot.title = element_text(hjust = 0.5, size = 16, face = "bold", color = "#2c3e50"),
axis.text = element_text(size = 12),
axis.title = element_text(size = 13, face = "bold")
)
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
# Get largest release date to convert to
latest_release_date <- max(movies$release_date, na.rm = TRUE)
latest_release_month <- as.numeric(substring(latest_release_date, 6, 7))
latest_release_year <- as.numeric(substring(latest_release_date, 1, 4))
# Adjust budget for inflation
cpi_latest <- filter(cpi, observation_year == latest_release_year & observation_month == latest_release_month)$CPIAUCSL
movies <- movies |>
mutate(budget = budget * (cpi_latest / CPIAUCSL))
# Replot the time series for budget
yearly_budget <- movies |>
group_by(release_year) |>
summarise(avg_budget = mean(budget, na.rm = TRUE), movie_count = n()) |>
arrange(release_year)
# Plot time series
ggplot(yearly_budget, aes(x = release_year, y = avg_budget)) +
geom_line(color = "#e74c3c", size = 1.5) + # Bold red line
geom_point(color = "#2c3e50", size = 3) + # Dark gray dots
theme_minimal() +
labs(
title = "Average Movie Budget Over Time",
x = "Release Year",
y = "Average Budget ($)"
) +
scale_y_continuous(labels = scales::dollar_format()) + # Format revenue as dollars
theme(
plot.title = element_text(hjust = 0.5, size = 16, face = "bold", color = "#2c3e50"),
axis.text = element_text(size = 12),
axis.title = element_text(size = 13, face = "bold")
)
store_movies <- movies
store_movies[] <- lapply(movies, function(x) {
if (is.character(x)) Encoding(x) <- "UTF-8"
return(x)
})
saveRDS(store_movies, file = "movies_cleaned.rds")
cat("Summary statistics for key numeric or date variables:\n")
## Summary statistics for key numeric or date variables:
summary(select(movies, revenue, popularity, budget, release_date, runtime, actor_avg_pop, crew_avg_pop))
## revenue popularity budget
## Min. :1.915e+08 Min. : 0.3816 Min. : 114314
## 1st Qu.:2.457e+08 1st Qu.: 7.6284 1st Qu.: 79038787
## Median :3.390e+08 Median : 11.2986 Median :132731340
## Mean :4.453e+08 Mean : 14.8793 Mean :145774730
## 3rd Qu.:5.242e+08 3rd Qu.: 16.9450 3rd Qu.:207913124
## Max. :2.924e+09 Max. :147.0271 Max. :535446283
## release_date runtime actor_avg_pop crew_avg_pop
## Min. :1950-02-22 Min. : 72.0 Min. : 0.09326 Min. :0.04116
## 1st Qu.:2002-11-02 1st Qu.:103.0 1st Qu.: 1.95674 1st Qu.:0.19500
## Median :2011-04-11 Median :116.5 Median : 2.91181 Median :0.31440
## Mean :2009-02-20 Mean :119.0 Mean : 3.32663 Mean :0.48676
## 3rd Qu.:2016-12-28 3rd Qu.:131.2 3rd Qu.: 4.21701 3rd Qu.:0.55189
## Max. :2024-12-19 Max. :224.0 Max. :12.08894 Max. :4.61122
cat("\nCount of movies for each season:")
##
## Count of movies for each season:
table(movies$season)
##
## Fall Spring Summer Winter
## 215 227 302 204
# Original scale
ggplot(movies, aes(x = revenue)) +
geom_histogram(bins = 30, fill = "#3498db", color = "white", alpha = 0.8) +
theme_minimal() +
labs(title = "Distribution of Movie Revenue",
x = "Revenue",
y = "Frequency") +
theme(plot.title = element_text(hjust = 0.5, size = 14, face = "bold"))
# Log scale
ggplot(movies, aes(x = log10(revenue))) +
geom_histogram(bins = 30, fill = "#3498db", color = "white", alpha = 0.8) +
theme_minimal() +
labs(title = "Distribution of Movie Revenue (Log Scale)",
x = "Log10(Revenue)",
y = "Frequency") +
theme(plot.title = element_text(hjust = 0.5, size = 14, face = "bold"))
ggplot(movies, aes(x = budget, y = revenue)) +
geom_point(alpha = 0.4, color = "#e74c3c") +
geom_smooth(method = "lm", color = "#2c3e50", se = FALSE, linetype = "dashed") +
theme_minimal() +
labs(title = "Budget vs. Revenue",
x = "Budget",
y = "Revenue") +
theme(plot.title = element_text(hjust = 0.5, size = 14, face = "bold"))
## `geom_smooth()` using formula = 'y ~ x'
ggplot(movies, aes(x = log10(budget), y = log10(revenue))) +
geom_point(alpha = 0.4, color = "#e74c3c") +
geom_smooth(method = "lm", color = "#2c3e50", se = FALSE, linetype = "dashed") +
theme_minimal() +
labs(title = "Budget vs. Revenue",
x = "Log10(Budget)",
y = "Log10(Revenue)") +
theme(plot.title = element_text(hjust = 0.5, size = 14, face = "bold"))
## `geom_smooth()` using formula = 'y ~ x'
## Revenue vs. Popularity
ggplot(movies, aes(x = popularity, y = revenue)) +
geom_point(alpha = 0.5, color = "#3498db") +
geom_smooth(method = "lm", color = "#2c3e50", se = FALSE, linetype = "dashed") +
theme_minimal() +
labs(
title = "Movie Popularity vs. Revenue",
x = "TMDb Popularity Score",
y = "Revenue"
) +
theme(
plot.title = element_text(hjust = 0.5, size = 14, face = "bold"),
axis.text = element_text(size = 12),
axis.title = element_text(size = 13, face = "bold")
)
## `geom_smooth()` using formula = 'y ~ x'
ggplot(movies, aes(x = popularity, y = log10(revenue))) +
geom_point(alpha = 0.5, color = "#3498db") +
geom_smooth(method = "lm", color = "#2c3e50", se = FALSE, linetype = "dashed") +
theme_minimal() +
labs(
title = "Movie Popularity vs. Revenue",
x = "TMDb Popularity Score",
y = "Log10(Revenue)"
) +
theme(
plot.title = element_text(hjust = 0.5, size = 14, face = "bold"),
axis.text = element_text(size = 12),
axis.title = element_text(size = 13, face = "bold")
)
## `geom_smooth()` using formula = 'y ~ x'
ggplot(movies, aes(x = runtime, y = log10(revenue))) +
geom_point(alpha = 0.6, color = "#E74C3C") + # Vibrant Coral Red
geom_smooth(method = "lm", color = "#2c3e50", se = FALSE, linetype = "dashed") +
theme_minimal() +
labs(title = "Movie Runtime vs. Revenue",
x = "Runtime (minutes)",
y = "Log10(Revenue)") +
theme(
plot.title = element_text(hjust = 0.5, size = 14, face = "bold"),
axis.text = element_text(size = 12),
axis.title = element_text(size = 13, face = "bold")
)
## `geom_smooth()` using formula = 'y ~ x'
ggplot(movies, aes(x = season, y = log10(revenue), fill = season)) +
geom_boxplot(alpha = 0.85, outlier.shape = 21, outlier.fill = "white", outlier.color = "black") +
scale_fill_manual(values = c("Winter" = "#74b9ff",
"Spring" = "#00b894",
"Summer" = "#f9ca24",
"Fall" = "#d35400")) +
theme_minimal() +
labs(title = "Movie Revenue by Release Season",
x = "Season",
y = "Log10(Revenue)") +
theme(
plot.title = element_text(hjust = 0.5, size = 14, face = "bold"),
legend.position = "none",
axis.text.x = element_text(face = "bold", size = 12)
)
# Compute average revenue per year
yearly_revenue <- movies |>
group_by(release_year) |>
summarise(avg_revenue = mean(revenue, na.rm = TRUE), movie_count = n()) |>
arrange(release_year)
# Plot time series
ggplot(yearly_revenue, aes(x = release_year, y = avg_revenue)) +
geom_line(color = "#e74c3c", size = 1.5) + # Bold red line
geom_point(color = "#2c3e50", size = 3) + # Dark gray dots
theme_minimal() +
labs(
title = "Average Movie Revenue Over Time",
x = "Release Year",
y = "Average Revenue ($)"
) +
scale_y_continuous(labels = scales::dollar_format()) + # Format revenue as dollars
theme(
plot.title = element_text(hjust = 0.5, size = 16, face = "bold", color = "#2c3e50"),
axis.text = element_text(size = 12),
axis.title = element_text(size = 13, face = "bold")
)
# Define colors based on genre themes
genre_colors <- c(
"Action" = "#e74c3c",
"Adventure" = "#f39c12",
"Animation" = "#f1c40f",
"Comedy" = "#2ecc71",
"Crime" = "#34495e",
"Documentary" = "#95a5a6",
"Drama" = "#9b59b6",
"Family" = "#e67e22",
"Fantasy" = "#8e44ad",
"History" = "#d35400",
"Horror" = "#c0392b",
"Music" = "#1abc9c",
"Mystery" = "#16a085",
"Romance" = "#e84393",
"Science Fiction" = "#27ae60",
"TV Movie" = "#7f8c8d",
"Thriller" = "#2c3e50",
"War" = "#bdc3c7",
"Western" = "#8d6e63"
)
# Prepare genre-specific revenue data
genre_revenue <- movies |>
select(starts_with("Action"):starts_with("Documentary"), revenue) |>
pivot_longer(cols = starts_with("Action"):starts_with("Documentary"), names_to = "Genre", values_to = "Presence") |>
filter(Presence == 1)
# Boxplot of revenue by genre
ggplot(genre_revenue, aes(x = reorder(Genre, log10(revenue), FUN = median), y = log10(revenue), fill = Genre)) +
geom_boxplot(alpha = 0.85, outlier.shape = 21, outlier.fill = "white", outlier.color = "black") +
scale_fill_manual(values = genre_colors) +
coord_flip() + # Horizontal layout for readability
theme_minimal() +
labs(title = "Revenue Distribution by Genre",
x = "Genre",
y = "Log10(Revenue)") +
theme(
plot.title = element_text(hjust = 0.5, size = 14, face = "bold"),
legend.position = "none",
axis.text.x = element_text(face = "bold", size = 12),
axis.text.y = element_text(face = "bold", size = 10)
)
highest_history_movie <- movies |>
filter(History == 1) |>
arrange(desc(revenue)) |>
slice_max(revenue, n = 1)
highest_history_movie
## # A tibble: 1 × 34
## movie_id title revenue popularity budget release_date runtime actors crews
## <int> <chr> <dbl> <dbl> <dbl> <date> <int> <chr> <chr>
## 1 872585 Oppenhei… 9.52e8 28.0 1.04e8 2023-07-19 181 Cilli… Andr…
## # ℹ 25 more variables: actor_avg_pop <dbl>, crew_avg_pop <dbl>,
## # release_year <dbl>, season <fct>, Action <dbl>, Adventure <dbl>,
## # Fantasy <dbl>, `Science Fiction` <dbl>, Drama <dbl>, Romance <dbl>,
## # Animation <dbl>, Comedy <dbl>, Family <dbl>, Thriller <dbl>, Crime <dbl>,
## # History <dbl>, Music <dbl>, War <dbl>, Mystery <dbl>, Horror <dbl>,
## # Western <dbl>, Documentary <dbl>, release_month <dbl>,
## # observation_date <chr>, CPIAUCSL <dbl>
ggplot(movies, aes(x = actor_avg_pop, y = log10(revenue))) +
geom_point(alpha = 0.5, color = "#9b59b6") +
geom_smooth(method = "lm", col = "#2c3e50", se = FALSE, linetype = "dashed") +
theme_minimal() +
labs(title = "Actor Popularity vs. Revenue",
x = "Average Actor Popularity",
y = "Log10(Revenue)") +
theme(plot.title = element_text(hjust = 0.5, size = 14, face = "bold"))
## `geom_smooth()` using formula = 'y ~ x'
ggplot(movies, aes(x = crew_avg_pop, y = log10(revenue))) +
geom_point(alpha = 0.5, color = "#e74c3c") +
geom_smooth(method = "lm", col = "#2c3e50", se = FALSE, linetype = "dashed") +
theme_minimal() +
labs(title = "Crew Popularity vs. Revenue",
x = "Average Crew Popularity",
y = "Log10(Revenue)") +
theme(
plot.title = element_text(hjust = 0.5, size = 14, face = "bold"),
axis.text = element_text(size = 12),
axis.title = element_text(size = 13, face = "bold")
)
## `geom_smooth()` using formula = 'y ~ x'
# Extract actor names without popularity score
top_actors_total <- movies |>
separate_rows(actors, sep = ", ") |>
mutate(actor_name = str_extract(actors, "^[^(]+")) |> # Extract only the name, ignoring numbers
group_by(actor_name) |>
summarise(total_revenue = sum(revenue, na.rm = TRUE)) |>
arrange(desc(total_revenue)) |>
slice_max(total_revenue, n = 20, with_ties = FALSE)
# Plot
ggplot(top_actors_total, aes(x = reorder(actor_name, total_revenue), y = total_revenue, fill = total_revenue)) +
geom_bar(stat = "identity", alpha = 0.9) +
coord_flip() +
scale_fill_gradient(low = "#F5B7B1", high = "#922B21") + # Light Pink → Deep Red
theme_minimal() +
labs(
title = "Top 20 Actors by Total Movie Revenue",
x = "Actor",
y = "Total Revenue ($)",
fill = "Total Revenue"
) +
scale_y_continuous(labels = scales::dollar_format()) +
theme(
plot.title = element_text(hjust = 0.5, size = 16, face = "bold", color = "#641E16"),
axis.text.x = element_text(angle = 30, hjust = 1, size = 12),
axis.text.y = element_text(size = 12, face = "bold")
)
# Extract top actors by average revenue per movie
top_actors_avg <- movies |>
separate_rows(actors, sep = ", ") |>
mutate(actor_name = str_extract(actors, "^[^(]+")) |>
group_by(actor_name) |>
summarise(avg_revenue = mean(revenue, na.rm = TRUE)) |>
arrange(desc(avg_revenue)) |>
slice_max(avg_revenue, n = 20, with_ties = FALSE)
# Plot
ggplot(top_actors_avg, aes(x = reorder(actor_name, avg_revenue), y = avg_revenue, fill = avg_revenue)) +
geom_bar(stat = "identity", alpha = 0.9) +
coord_flip() +
scale_fill_gradient(low = "#F8DE7E", high = "#D4AC0D") + # Light Gold → Deep Gold
theme_minimal() +
labs(
title = "Top 20 Actors by Average Movie Revenue",
x = "Actor",
y = "Average Revenue per Movie ($)",
fill = "Avg Revenue"
) +
scale_y_continuous(labels = scales::dollar_format()) +
theme(
plot.title = element_text(hjust = 0.5, size = 16, face = "bold", color = "#7D6608"),
axis.text.x = element_text(angle = 30, hjust = 1, size = 12),
axis.text.y = element_text(size = 12, face = "bold")
)
# Extract top actors by movie count
top_actor_counts <- movies |>
separate_rows(actors, sep = ", ") |>
mutate(actor_name = str_extract(actors, "^[^(]+")) |>
group_by(actor_name) |>
summarise(movie_count = n()) |>
arrange(desc(movie_count)) |>
slice_max(movie_count, n = 20, with_ties = FALSE)
# Plot
ggplot(top_actor_counts, aes(x = reorder(actor_name, movie_count), y = movie_count, fill = movie_count)) +
geom_bar(stat = "identity", alpha = 0.9) +
coord_flip() +
scale_fill_gradient(low = "#FAD7A0", high = "#D35400") + # Light Orange → Deep Orange
theme_minimal() +
labs(
title = "Top 20 Actors by Number of Movies",
x = "Actor",
y = "Movie Count",
fill = "Movie Count"
) +
theme(
plot.title = element_text(hjust = 0.5, size = 16, face = "bold", color = "#B34700"),
axis.text.x = element_text(angle = 30, hjust = 1, size = 12),
axis.text.y = element_text(size = 12, face = "bold")
)
# Extract top crew members by total revenue
top_crew_total <- movies |>
separate_rows(crews, sep = ", ") |>
mutate(crew_name = str_extract(crews, "^[^(]+")) |>
group_by(crew_name) |>
summarise(total_revenue = sum(revenue, na.rm = TRUE)) |>
arrange(desc(total_revenue)) |>
slice_max(total_revenue, n = 20, with_ties = FALSE)
# Plot
ggplot(top_crew_total, aes(x = reorder(crew_name, total_revenue), y = total_revenue, fill = total_revenue)) +
geom_bar(stat = "identity", alpha = 0.9) +
coord_flip() +
scale_fill_gradient(low = "#A2D9CE", high = "#117A65") + # Light Teal → Deep Green
theme_minimal() +
labs(
title = "Top 20 Crew Members by Total Movie Revenue",
x = "Crew Member",
y = "Total Revenue ($)",
fill = "Total Revenue"
) +
scale_y_continuous(labels = scales::dollar_format()) +
theme(
plot.title = element_text(hjust = 0.5, size = 16, face = "bold", color = "#154360"),
axis.text.x = element_text(angle = 30, hjust = 1, size = 12),
axis.text.y = element_text(size = 12, face = "bold")
)
# Extract top crew members by average revenue per movie
top_crew_avg <- movies |>
separate_rows(crews, sep = ", ") |>
mutate(crew_name = str_extract(crews, "^[^(]+")) |>
group_by(crew_name) |>
summarise(avg_revenue = mean(revenue, na.rm = TRUE)) |>
arrange(desc(avg_revenue)) |>
slice_max(avg_revenue, n = 20, with_ties = FALSE)
# Plot
ggplot(top_crew_avg, aes(x = reorder(crew_name, avg_revenue), y = avg_revenue, fill = avg_revenue)) +
geom_bar(stat = "identity", alpha = 0.9) +
coord_flip() +
scale_fill_gradient(low = "#85C1E9", high = "#1B4F72") + # Light Blue → Deep Blue
theme_minimal() +
labs(
title = "Top 20 Crew Members by Average Movie Revenue",
x = "Crew Member",
y = "Average Revenue per Movie ($)",
fill = "Avg Revenue"
) +
scale_y_continuous(labels = scales::dollar_format()) +
theme(
plot.title = element_text(hjust = 0.5, size = 16, face = "bold", color = "#154360"),
axis.text.x = element_text(angle = 30, hjust = 1, size = 12),
axis.text.y = element_text(size = 12, face = "bold")
)
# Extract top crew members by movie count
top_crew_counts <- movies |>
separate_rows(crews, sep = ", ") |>
mutate(crew_name = str_extract(crews, "^[^(]+")) |>
group_by(crew_name) |>
summarise(movie_count = n()) |>
arrange(desc(movie_count)) |>
slice_max(movie_count, n = 20, with_ties = FALSE)
# Plot
ggplot(top_crew_counts, aes(x = reorder(crew_name, movie_count), y = movie_count, fill = movie_count)) +
geom_bar(stat = "identity", alpha = 0.9) +
coord_flip() +
scale_fill_gradient(low = "#D7BDE2", high = "#6C3483") + # Light Purple → Deep Violet
theme_minimal() +
labs(
title = "Top 20 Crew Members by Number of Movies",
x = "Crew Member",
y = "Movie Count",
fill = "Movie Count"
) +
theme(
plot.title = element_text(hjust = 0.5, size = 16, face = "bold", color = "#512E5F"),
axis.text.x = element_text(angle = 30, hjust = 1, size = 12),
axis.text.y = element_text(size = 12, face = "bold")
)
# Select only numerical columns
numeric_vars <- select(movies, revenue, popularity, budget, runtime, actor_avg_pop, crew_avg_pop)
# Compute correlation matrix
corr_matrix <- cor(numeric_vars, use = "complete.obs")
# Define a custom theme for better aesthetics
ggcorrplot(corr_matrix, method = "square",
colors = c("#1f77b4", "white", "#d62728"), # Blue (negative) → White (neutral) → Red (positive)
lab = TRUE, # Show correlation values
lab_size = 5, # Larger labels
outline.color = "black", # Borders for better contrast
ggtheme = theme_minimal()) + # Minimalist theme
theme(
plot.title = element_text(hjust = 0.5, size = 16, face = "bold", color = "#2c3e50"), # Elegant title
panel.grid.major = element_blank(), # Remove grid lines for a clean look
panel.border = element_blank(), # No outer border
axis.text.x = element_text(size = 12, angle = 45, hjust = 1, color = "#2c3e50", face = "bold"), # Rotated x-labels
axis.text.y = element_text(size = 12, color = "#2c3e50", face = "bold") # Bold y-labels
) +
labs(title = "Correlation Heatmap: Movie Revenue vs Predictors")
lm_model <- lm(log10(revenue) ~ log10(budget) + popularity + runtime + actor_avg_pop + crew_avg_pop, data = movies)
summary(lm_model)
##
## Call:
## lm(formula = log10(revenue) ~ log10(budget) + popularity + runtime +
## actor_avg_pop + crew_avg_pop, data = movies)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.54952 -0.13534 -0.02981 0.12426 0.66769
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.5760463 0.1440520 45.650 < 2e-16 ***
## log10(budget) 0.2220063 0.0184891 12.007 < 2e-16 ***
## popularity 0.0050809 0.0004400 11.547 < 2e-16 ***
## runtime 0.0010715 0.0002969 3.609 0.000323 ***
## actor_avg_pop 0.0037404 0.0034444 1.086 0.277782
## crew_avg_pop -0.0033722 0.0112840 -0.299 0.765119
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.191 on 942 degrees of freedom
## Multiple R-squared: 0.3103, Adjusted R-squared: 0.3067
## F-statistic: 84.78 on 5 and 942 DF, p-value: < 2.2e-16
lm_model <- lm(
log10(revenue) ~ Action + Adventure + Fantasy + `Science Fiction` + Drama + Romance +
Animation + Comedy + Family + Thriller + Crime + History + Music + War +
Mystery + Horror + Western + Documentary,
data = movies
)
summary(lm_model)
##
## Call:
## lm(formula = log10(revenue) ~ Action + Adventure + Fantasy +
## `Science Fiction` + Drama + Romance + Animation + Comedy +
## Family + Thriller + Crime + History + Music + War + Mystery +
## Horror + Western + Documentary, data = movies)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.45437 -0.14608 -0.03736 0.12388 0.83213
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 8.553324 0.025006 342.056 < 2e-16 ***
## Action 0.018428 0.018455 0.999 0.31829
## Adventure 0.095560 0.017211 5.552 3.68e-08 ***
## Fantasy 0.034539 0.019141 1.804 0.07149 .
## `Science Fiction` 0.047815 0.019700 2.427 0.01541 *
## Drama -0.026308 0.019756 -1.332 0.18331
## Romance -0.004239 0.024257 -0.175 0.86133
## Animation 0.078728 0.028879 2.726 0.00653 **
## Comedy -0.075516 0.018622 -4.055 5.43e-05 ***
## Family -0.022729 0.027720 -0.820 0.41245
## Thriller -0.032745 0.020138 -1.626 0.10428
## Crime -0.035385 0.024929 -1.419 0.15611
## History -0.087570 0.040227 -2.177 0.02974 *
## Music -0.005852 0.046740 -0.125 0.90039
## War 0.015510 0.042627 0.364 0.71605
## Mystery -0.031443 0.031308 -1.004 0.31548
## Horror -0.093029 0.034768 -2.676 0.00759 **
## Western -0.113331 0.080746 -1.404 0.16078
## Documentary -0.168344 0.151804 -1.109 0.26773
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2098 on 929 degrees of freedom
## Multiple R-squared: 0.1793, Adjusted R-squared: 0.1634
## F-statistic: 11.28 on 18 and 929 DF, p-value: < 2.2e-16
anova_model <- aov(log10(revenue) ~ season, data = movies)
summary(anova_model)
## Df Sum Sq Mean Sq F value Pr(>F)
## season 3 0.93 0.30925 5.969 0.000496 ***
## Residuals 944 48.91 0.05181
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
str(movies)
## tibble [948 × 34] (S3: tbl_df/tbl/data.frame)
## $ movie_id : int [1:948] 19995 299534 76600 597 140607 299536 634649 1022789 135397 420818 ...
## $ title : chr [1:948] "Avatar" "Avengers: Endgame" "Avatar: The Way of Water" "Titanic" ...
## $ revenue : num [1:948] 2.92e+09 2.80e+09 2.32e+09 2.26e+09 2.07e+09 ...
## $ popularity : num [1:948] 27.7 63.2 39.5 44.4 16.4 ...
## $ budget : num [1:948] 3.46e+08 4.43e+08 4.89e+08 3.93e+08 3.27e+08 ...
## $ release_date : Date[1:948], format: "2009-12-15" "2019-04-24" ...
## $ runtime : int [1:948] 162 181 192 194 136 149 148 97 124 118 ...
## $ actors : chr [1:948] "Sam Worthington(2.8), Zoe Saldaña(9), Sigourney Weaver(3.6), Stephen Lang(1.7), Michelle Rodriguez(4.6)" "Robert Downey Jr.(11.1), Chris Evans(7.5), Mark Ruffalo(3.4), Chris Hemsworth(13.5), Scarlett Johansson(20)" "Sam Worthington(2.8), Zoe Saldaña(9), Sigourney Weaver(3.6), Stephen Lang(1.7), Kate Winslet(6.9)" "Leonardo DiCaprio(8.8), Kate Winslet(6.9), Billy Zane(1.7), Kathy Bates(2.3), Frances Fisher(2.1)" ...
## $ crews : chr [1:948] "James Cameron(2.9), James Cameron(2.9), Ilram Choi(0.1), Woody Schultz(0.1), Mauro Fiore(0.1)" "Paul Schneider(0.1), Louis D'Esposito(0.5), Carlos Pacheco(0.1), John David Duncan(0.1), Christopher Kelly(0.2)" "James Cameron(2.9), James Cameron(2.9), James Cameron(2.9), Michele Perry(0.1), Carly Marr(0)" "Martin Laing(0.1), Russell Carpenter(0.2), James Cameron(2.9), Giedra Rackauskas(0), Peter Lamont(0.2)" ...
## $ actor_avg_pop : num [1:948] 4.33 11.11 4.8 4.35 2.77 ...
## $ crew_avg_pop : num [1:948] 1.231 0.198 1.777 0.691 0.904 ...
## $ release_year : num [1:948] 2009 2019 2022 1997 2015 ...
## $ season : Factor w/ 4 levels "Fall","Spring",..: 4 2 4 1 4 2 4 3 3 3 ...
## $ Action : num [1:948] 1 1 1 0 1 1 1 0 1 0 ...
## $ Adventure : num [1:948] 1 1 1 0 1 1 1 1 1 1 ...
## $ Fantasy : num [1:948] 1 0 0 0 0 0 0 0 0 0 ...
## $ Science Fiction : num [1:948] 1 1 1 0 1 1 1 0 1 0 ...
## $ Drama : num [1:948] 0 0 0 1 0 0 0 0 0 1 ...
## $ Romance : num [1:948] 0 0 0 1 0 0 0 0 0 0 ...
## $ Animation : num [1:948] 0 0 0 0 0 0 0 1 0 1 ...
## $ Comedy : num [1:948] 0 0 0 0 0 0 0 1 0 0 ...
## $ Family : num [1:948] 0 0 0 0 0 0 0 1 0 1 ...
## $ Thriller : num [1:948] 0 0 0 0 0 0 0 0 1 0 ...
## $ Crime : num [1:948] 0 0 0 0 0 0 0 0 0 0 ...
## $ History : num [1:948] 0 0 0 0 0 0 0 0 0 0 ...
## $ Music : num [1:948] 0 0 0 0 0 0 0 0 0 0 ...
## $ War : num [1:948] 0 0 0 0 0 0 0 0 0 0 ...
## $ Mystery : num [1:948] 0 0 0 0 0 0 0 0 0 0 ...
## $ Horror : num [1:948] 0 0 0 0 0 0 0 0 0 0 ...
## $ Western : num [1:948] 0 0 0 0 0 0 0 0 0 0 ...
## $ Documentary : num [1:948] 0 0 0 0 0 0 0 0 0 0 ...
## $ release_month : num [1:948] 12 4 12 11 12 4 12 6 6 7 ...
## $ observation_date: chr [1:948] "2009-12-01" "2019-04-01" "2022-12-01" "1997-11-01" ...
## $ CPIAUCSL : num [1:948] 217 255 299 162 238 ...
rf_data <- movies |>
mutate(
log_revenue = log10(revenue),
log_budget = log10(budget),
) |>
rename(Science_Fiction = `Science Fiction`) |>
drop_na() # Ensure no missing values
n <- nrow(rf_data)
train_idx <- sample(1:n, size = 0.8 * n)
train <- rf_data[train_idx, ]
test <- rf_data[-train_idx, ]
rf_model <- randomForest(
log_revenue ~
log_budget + popularity + runtime + actor_avg_pop + crew_avg_pop + season +
Adventure + Fantasy + Science_Fiction + Animation + Action + Family + War +
Thriller + Comedy + Romance + Mystery + Drama + Music + Crime + Horror +
Western + Documentary + History,
data = train,
ntree = 500,
importance = TRUE
)
print(rf_model)
##
## Call:
## randomForest(formula = log_revenue ~ log_budget + popularity + runtime + actor_avg_pop + crew_avg_pop + season + Adventure + Fantasy + Science_Fiction + Animation + Action + Family + War + Thriller + Comedy + Romance + Mystery + Drama + Music + Crime + Horror + Western + Documentary + History, data = train, ntree = 500, importance = TRUE)
## Type of random forest: regression
## Number of trees: 500
## No. of variables tried at each split: 8
##
## Mean of squared residuals: 0.03224818
## % Var explained: 41.25
pred <- predict(rf_model, newdata = test)
true <- test$log_revenue
r_squared <- 1 - sum((pred - true)^2) / sum((true - mean(true))^2)
rmse <- sqrt(mean((pred - true)^2))
cat("R-squared:", round(r_squared, 3), "\n")
## R-squared: 0.446
cat("RMSE:", round(rmse, 3), "\n")
## RMSE: 0.152
varImpPlot(rf_model, main = "Random Forest Feature Importance")
rf_predict <- function(newdata) {
log_prediction <- predict(rf_model, newdata)
10^log_prediction # Convert back to original scale
}
evaluate_model_predictions <- function(model, test_data, title_col = "title", revenue_col = "revenue") {
test <- test_data %>%
mutate(
predicted_log_revenue = predict(model, newdata = test_data),
actual_log_revenue = log10(.data[[revenue_col]]),
residual = actual_log_revenue - predicted_log_revenue
)
top100 <- test %>%
arrange(desc(.data[[revenue_col]])) %>%
slice_head(n = 100) %>%
mutate(
title = fct_reorder(.data[[title_col]], actual_log_revenue),
label_actual = paste0("Title: ", .data[[title_col]],
"\nActual log-revenue: ", round(actual_log_revenue, 2)),
label_pred = paste0("Title: ", .data[[title_col]],
"\nPredicted log-revenue: ", round(predicted_log_revenue, 2))
)
return(top100)
}
plot_prediction_comparison <- function(top100_df) {
gg <- ggplot(top100_df, aes(x = title)) +
geom_point(aes(y = actual_log_revenue, text = label_actual), color = "black") +
geom_point(aes(y = predicted_log_revenue, text = label_pred), color = "red") +
geom_segment(aes(y = predicted_log_revenue, yend = actual_log_revenue, xend = title),
color = "gray", linetype = "dashed") +
coord_flip() +
theme_minimal() +
labs(
title = "Actual vs Predicted Log Revenue (Top 100 Grossing Films)",
x = "Movie Title",
y = "Log10(Revenue)"
)
ggplotly(gg, tooltip = "text", height = 1500, width = 900)
}
top100_rf <- evaluate_model_predictions(rf_model, test)
plot_prediction_comparison(top100_rf)
## Warning in geom_point(aes(y = actual_log_revenue, text = label_actual), :
## Ignoring unknown aesthetics: text
## Warning in geom_point(aes(y = predicted_log_revenue, text = label_pred), :
## Ignoring unknown aesthetics: text
xgb_data <- movies %>%
mutate(
log_budget = log10(budget),
log_revenue = log10(revenue)
) %>%
select(
-c(movie_id, title, release_date, actors, crews, budget, revenue,
release_month, release_year, observation_date, CPIAUCSL)
) %>%
drop_na()
# Convert categorical/factor to dummies (season and genres)
xgb_matrix <- model.matrix(~ . - log_revenue, data = xgb_data)[, -1] # remove intercept
# Labels
xgb_labels <- xgb_data$log_revenue
# Train/test split
set.seed(42)
n <- nrow(xgb_matrix)
train_idx <- sample(1:n, size = 0.8 * n)
xgb_train <- xgb_matrix[train_idx, ]
xgb_test <- xgb_matrix[-train_idx, ]
y_train <- xgb_labels[train_idx]
y_test <- xgb_labels[-train_idx]
xgb_model <- xgboost(
data = xgb_train,
label = y_train,
nrounds = 100,
objective = "reg:squarederror",
max_depth = 6,
eta = 0.1,
subsample = 0.8,
colsample_bytree = 0.8,
verbose = 0
)
xgb_pred <- predict(xgb_model, newdata = xgb_test)
r_squared_xgb <- 1 - sum((xgb_pred - y_test)^2) / sum((y_test - mean(y_test))^2)
rmse_xgb <- sqrt(mean((xgb_pred - y_test)^2))
cat("XGBoost R-squared:", round(r_squared_xgb, 3), "\n")
## XGBoost R-squared: 0.414
cat("XGBoost RMSE:", round(rmse_xgb, 3), "\n")
## XGBoost RMSE: 0.157
importance <- xgb.importance(model = xgb_model)
xgb.plot.importance(importance_matrix = importance, top_n = 20)
## Function to predict revenue on original scale using XGBoost
predict_movie_revenue_xgb <- function(newdata_matrix) {
log_pred <- predict(xgb_model, newdata = newdata_matrix)
10^log_pred
}
evaluate_model_predictions_xgb <- function(model, test_matrix, original_df, title_col = "title", revenue_col = "revenue") {
# Make predictions
predicted_log_revenue <- predict(model, newdata = test_matrix)
# Add predictions back to the original test data
test <- original_df %>%
filter(complete.cases(.)) %>%
slice(-train_idx) %>% # Use the same test rows
mutate(
predicted_log_revenue = predicted_log_revenue,
actual_log_revenue = log10(.data[[revenue_col]]),
residual = actual_log_revenue - predicted_log_revenue
) %>%
arrange(desc(.data[[revenue_col]])) %>%
slice_head(n = 100) %>%
mutate(
title = fct_reorder(.data[[title_col]], actual_log_revenue),
label_actual = paste0("Title: ", .data[[title_col]],
"\nActual log-revenue: ", round(actual_log_revenue, 2)),
label_pred = paste0("Title: ", .data[[title_col]],
"\nPredicted log-revenue: ", round(predicted_log_revenue, 2))
)
return(test)
}
top100_xgb <- evaluate_model_predictions_xgb(
model = xgb_model,
test_matrix = xgb_test,
original_df = movies
)
plot_prediction_comparison(top100_xgb)
## Warning in geom_point(aes(y = actual_log_revenue, text = label_actual), :
## Ignoring unknown aesthetics: text
## Warning in geom_point(aes(y = predicted_log_revenue, text = label_pred), :
## Ignoring unknown aesthetics: text